library(readr)
library(ggplot2)
library(tidyr)
library(dplyr)
library(lubridate)
library(stringr)
library(jhur)

Read in Data

Read in the Charm City Circulator dataset:

circ = read_csv("http://johnmuschelli.com/intro_to_r/data/Charm_City_Circulator_Ridership.csv") or circ = read_circulator()

# Download the ridership CSV and parse it into a tibble.
circ <- read_csv(
  file = "http://johnmuschelli.com/intro_to_r/data/Charm_City_Circulator_Ridership.csv"
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   day = col_character(),
##   date = col_character(),
##   orangeBoardings = col_double(),
##   orangeAlightings = col_double(),
##   orangeAverage = col_double(),
##   purpleBoardings = col_double(),
##   purpleAlightings = col_double(),
##   purpleAverage = col_double(),
##   greenBoardings = col_double(),
##   greenAlightings = col_double(),
##   greenAverage = col_double(),
##   bannerBoardings = col_double(),
##   bannerAlightings = col_double(),
##   bannerAverage = col_double(),
##   daily = col_double()
## )
# convert the date column from month-day-year text to Date
circ <- circ %>% mutate(date = mdy(date))
# put a "." between the route name and the measurement name so the
# column names can be split into two parts when reshaping
colnames(circ) <- circ %>%
  colnames() %>%
  str_replace("Board", ".Board") %>%
  str_replace("Alight", ".Alight") %>%
  str_replace("Average", ".Average")

# make long: one row per day, route and measurement type
long <- circ %>%
  pivot_longer(
    starts_with(c("orange", "purple", "green", "banner")),
    names_to = "var",
    values_to = "number"
  ) %>%
  # split the "route.type" names into a route column and a type column
  separate(var, into = c("route", "type"), sep = "[.]")

or run:

# Load the already-reshaped data with the helper (presumably from jhur) and
# rename its line column to route so it matches the manual version.
long <- rename(read_circulator_long(), route = line)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   day = col_character(),
##   date = col_character(),
##   orangeBoardings = col_double(),
##   orangeAlightings = col_double(),
##   orangeAverage = col_double(),
##   purpleBoardings = col_double(),
##   purpleAlightings = col_double(),
##   purpleAverage = col_double(),
##   greenBoardings = col_double(),
##   greenAlightings = col_double(),
##   greenAverage = col_double(),
##   bannerBoardings = col_double(),
##   bannerAlightings = col_double(),
##   bannerAverage = col_double(),
##   daily = col_double()
## )
## take just average ridership per day, dropping rows with no value
avg <- long %>%
  filter(type == "Average", !is.na(number))

# spread the measurement types into their own columns (one per type)
type_wide <- long %>%
  pivot_wider(names_from = "type", values_from = "number")
head(type_wide)
## # A tibble: 6 x 7
##   day       date       daily route  Boardings Alightings Average
##   <chr>     <date>     <dbl> <chr>      <dbl>      <dbl>   <dbl>
## 1 Monday    2010-01-11  952  orange       877       1027    952 
## 2 Tuesday   2010-01-12  796  orange       777        815    796 
## 3 Wednesday 2010-01-13 1212. orange      1203       1220   1212.
## 4 Thursday  2010-01-14 1214. orange      1194       1233   1214.
## 5 Friday    2010-01-15 1644  orange      1645       1643   1644 
## 6 Saturday  2010-01-16 1490. orange      1457       1524   1490.

Part 1

In these questions, try to use ggplot2 if possible.

  1. Plot average ridership (avg data set) by date.
# Quick-plot scatter of average ridership against date.
q <- qplot(date, number, data = avg)
# Restrict the x axis to a date window; points outside it are dropped.
q + xlim(ymd("2011/05/03", "2012/06/04"))
## Warning: Removed 1871 rows containing missing values (geom_point).

# Same scatter built with ggplot(): base plot object first, layers added after.
g <- ggplot(data = avg, mapping = aes(date, number))
g + geom_point()

# Same points, with the x axis limited to a date window.
g +
  geom_point() +
  xlim(ymd("2011/05/03", "2012/06/04"))
## Warning: Removed 1871 rows containing missing values (geom_point).

  1. Color the points by route (orange, purple, green, banner)
# Quick plot with one point colour per route.
qplot(date, number, data = avg, colour = route)

# Store the same plot in an object and draw it explicitly.
first_plot <- qplot(date, number, data = avg, colour = route)
print(first_plot)

# ggplot() version: map route to the colour aesthetic.
g <- ggplot(data = avg, mapping = aes(date, number, colour = route))
g + geom_point()

  1. Add black smoothed curves for each route.
# Group the smoother by route so each route gets its own black curve.
qplot(date, number, data = avg, colour = route) +
  geom_smooth(mapping = aes(group = route), colour = "black")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Without a fixed colour, the smoother inherits the per-route colour mapping.
qplot(date, number, data = avg, colour = route) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# A constant colour on the layer replaces the inherited colour = route mapping,
# and with it the per-route grouping, so group by route explicitly to get one
# black curve per route rather than a single pooled curve.
g +
  geom_point() +
  geom_smooth(aes(group = route), color = "black")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Default smoother: inherits the aesthetics mapped in the base plot.
g +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

  1. Color the points by day of the week
# Colour the points by day of the week.
qplot(date, number, data = avg, colour = day)

# NOTE(review): this colours by route, not by day of the week, and looks
# carried over from the smoothing question -- confirm it belongs here.
qplot(date, number, data = avg, colour = route) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Turn day into a factor whose levels run Monday through Sunday, so the
# colour legend follows that order.
avg <- mutate(
  avg,
  dayFactor = factor(
    day,
    levels = c(
      "Monday", "Tuesday", "Wednesday", "Thursday",
      "Friday", "Saturday", "Sunday"
    )
  )
)
g <- ggplot(data = avg, mapping = aes(date, number, colour = dayFactor))
g + geom_point()

  1. Replot 1a where the colors of the points are the name of the route (with banner –> blue)
# Named palette: each route name maps to the colour used to draw it.
pal <- c(
  banner = "blue",
  purple = "purple",
  green = "darkgreen",
  orange = "orange"
)
qplot(date, number, data = avg, colour = route) +
  scale_colour_manual(values = pal)

# ggplot() version of the route-coloured scatter using the manual palette.
g <- ggplot(data = avg, mapping = aes(date, number, colour = route))
g +
  geom_point() +
  scale_colour_manual(values = pal)

Part 2

  1. plot average ridership by date with one panel per route
# One panel per route, requested through qplot's facets argument.
qplot(date, number, data = avg, facets = ~route)

# Same panels via an explicit facet_wrap() layer.
qplot(date, number, data = avg) +
  facet_wrap(~route)

# Panels laid out in four columns.
qplot(date, number, data = avg) +
  facet_wrap(~route, ncol = 4)

# Panels per route, with points coloured by the route palette.
qplot(date, number, data = avg, facets = ~route, colour = route) +
  scale_colour_manual(values = pal)

# ggplot() version: one panel per route, coloured with the route palette.
g <- ggplot(data = avg, mapping = aes(date, number, colour = route))
g +
  geom_point() +
  facet_wrap(~route) +
  scale_colour_manual(values = pal)

  1. Plot average ridership by date with separate panels by day of the week, colored by route
# One panel per day of the week; points coloured by route.
qplot(date, number, data = avg, facets = ~day, colour = route) +
  scale_colour_manual(values = pal)

# ggplot() version: panels by day of the week, colours by route.
ggplot(data = avg, mapping = aes(date, number, colour = route)) +
  geom_point() +
  facet_wrap(~day) +
  scale_colour_manual(values = pal)

Part 3

  1. Plot average ridership (avg) by date, colored by route (same as 1a). Do not compute a new average; use the Average values already provided for each route. Make the x-label "Year" and the y-label "Number of People", use the black-and-white theme theme_bw(), and set the text size with theme(text = element_text(size = 20)).
# Base scatter: average ridership by date, coloured with the route palette.
first_plot <- ggplot(data = avg, mapping = aes(date, number, colour = route)) +
  geom_point() +
  scale_colour_manual(values = pal)

# Relabel the axes, then apply theme_bw() before the text-size tweak so the
# complete theme does not reset the larger text.
first_plot +
  labs(x = "Year", y = "Number of People") +
  theme_bw() +
  theme(text = element_text(size = 20))

  1. Plot average ridership on the orange route versus date as a solid line, and add dashed “error” lines based on the boardings and alightings. The line colors should be orange. (Hint: linetype is an aesthetic for lines; see also scale_linetype and scale_linetype_manual, e.g. Alightings = "dashed", Boardings = "dashed", Average = "solid".)
# Keep all rows of the long data that belong to the orange route.
orange <- filter(long, route == "orange")

Here the line type is set to dashed for the whole layer:

# Line type and colour are both fixed values set outside aes().
ggplot(data = orange, mapping = aes(date, number)) +
  geom_line(colour = "orange", linetype = "dashed")

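Don't do this: inside aes(), "orange" is treated as a data value to be mapped rather than a literal colour, so the line is drawn in a default palette colour with a legend entry labelled "orange".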

# Counter-example: the colour is placed inside aes(), so it is mapped as data
# instead of being used as the literal line colour.
ggplot(data = orange, mapping = aes(date, number)) +
  geom_line(mapping = aes(colour = "orange"), linetype = "dashed")

Now the line type varies with the measurement type:

# Map type to linetype (one line per measurement type); colour stays fixed.
ggplot(data = orange, mapping = aes(date, number)) +
  geom_line(mapping = aes(linetype = type), colour = "orange")

this one as a quick plot

# qplot() treats its arguments as aesthetic mappings, so a bare "orange" would
# be mapped as data; wrap it in I() to set the literal line colour.
qplot(data = orange, x = date, y = number,
      linetype = type, geom = "line", colour = I("orange"))

# Unnamed values are paired with the type values in sorted order
# (Alightings, Average, Boardings -- assuming type is a character column),
# so "solid" must sit in the middle for Average to be the solid line.
ggplot(orange, aes(x = date, y = number)) +
  geom_line(aes(linetype = type), colour = "orange") +
  scale_linetype_manual(values = c("dashed", "solid", "dashed"))

# Naming the values ties each line type to its measurement type, whatever
# order the types appear in.
ggplot(data = orange, mapping = aes(date, number)) +
  geom_line(mapping = aes(linetype = type), colour = "orange") +
  scale_linetype_manual(
    values = c(
      Alightings = "dashed",
      Boardings = "dashed",
      Average = "solid"
    )
  )